# NY Uber dataset
#
# All data was taken from fivethirtyeight's data:
# https://github.com/fivethirtyeight/uber-tlc-foil-response
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import googlemaps

DATA_PATH = 'data/'

# Uber pickups (January-June 2015) and the taxi-zone lookup table.
raw = pd.read_csv(DATA_PATH+'uber-raw-data-janjune-15.csv',parse_dates=['Pickup_date'])
locID = pd.read_csv(DATA_PATH+'taxi-zone-lookup.csv')
raw.head()

# Pickups per zone: one histogram bin per distinct zone id.
plt.hist(raw['locationID'],bins = len(raw['locationID'].unique()));

locID.head()

print('Total zones: {}'.format(len(locID['LocationID'].unique())))
print('Zones represented in Uber dataset: {}'.format(len(raw['locationID'].unique())))
# Zone ids that appear in the pickups but not in the lookup table.
print(list(set(raw['locationID'].unique()) - set(locID['LocationID'].unique())))

# SECURITY: the API key used to be hard-coded on this line. A key committed to
# source control must be considered leaked and should be revoked; the client
# now reads it from the environment instead.
_gmaps_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not _gmaps_key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable '
                       'before running the Google Maps queries.')
gmaps = googlemaps.Client(key=_gmaps_key)

# Smoke test of the geocoding and directions endpoints on the first zone.
search = locID['Zone'][0]
print(search)
geocode_result = gmaps.geocode(search)
geocode_result

directions_result = gmaps.directions(search,'astoria park',mode='driving',units = 'metric', departure_time='now')
directions_result
import os

import folium

# Smoke test of the Distance Matrix endpoint on a 2x2 set of places.
res = gmaps.distance_matrix([search,'Statue of Liberty'],['astoria park','Alphabet City, NY'],mode='driving',units = 'metric', departure_time='now')
res

# Geocode every taxi zone once and cache the result on disk.
# The per-zone geocode call had been commented out, which made every zone
# reuse the coordinates of whatever `geocode_result` was left over from an
# earlier cell and overwrote gps_zones.csv with them. The call is restored,
# but only runs when the cache file does not exist yet, so re-running this
# cell does not re-issue the API queries.
_gps_zones_csv = DATA_PATH+'gps_zones.csv'
if not os.path.exists(_gps_zones_csv):
    full_dict = {}
    for i,s in enumerate(locID['Zone']):
        borough = locID['Borough'][i]
        search = s + ', ' + borough
        geocode_result = gmaps.geocode(search,region='us')
        if geocode_result:
            location = geocode_result[0]['geometry']['location']
            lat, lng = location['lat'], location['lng']
        else:
            # Keep the row but make the missing coordinates explicit.
            print('No geocoding result for {}'.format(search))
            lat, lng = np.nan, np.nan
        full_dict[i] = {
            'LocationID': locID['LocationID'][i],
            'Borough': borough,
            'Zone': s,
            'lat': lat,
            'lng': lng,
        }
    gpsData = pd.DataFrame.from_dict(full_dict,orient='index')
    gpsData.to_csv(_gps_zones_csv)

# Always work from the cached file; its first column is the saved index.
gpsData = pd.read_csv(_gps_zones_csv)
gpsData = gpsData.drop('Unnamed: 0',axis=1)
gpsData.head()

# Number of pickups per zone, aligned on the zone order of gpsData
# (zones without any pickup get 0).
gpsData['Counts'] = raw['locationID'].value_counts().reindex(gpsData.LocationID.unique(), fill_value=0).values

#look at the top 50 zones by occurrence in the dataset
top50 = gpsData.sort_values(by='Counts',ascending=False).head(50)['LocationID'].values
# Map of all zones; the 50 busiest ones get a red info marker, the others the
# default marker. The tooltip shows the pickup count of the zone.
m = folium.Map(location=[40.7, -73.9], zoom_start=11, tiles='Stamen Terrain')

for _, zone_row in gpsData.iterrows():
    lat = zone_row['lat']
    lng = zone_row['lng']
    zone = zone_row['Zone']
    loc = zone_row['LocationID']
    counts = zone_row['Counts']

    marker_options = {
        'popup': '<i> {} </i>'.format(zone),
        'tooltip': counts,
    }
    if loc in top50:
        marker_options['icon'] = folium.Icon(color='red', icon='info-sign')
    folium.Marker([lat,lng], **marker_options).add_to(m)

#The top 50 busiest locations are represented on the map
m

# Taking it from there
#
# Find a way to smartly query distances:
# Options:
# - 1. reducing the number of nodes by grouping geographically
# - 2. reducing the number of nodes by only taking the top 100 or so nodes
# - 3. querying for a set of central nodes, the distances from each node to
#      their closest node and trying to correct the errors with L2 distance,
#      or triangulation
# - 4. defining passing nodes (bridges) that have to be crossed to change
#      boroughs. We can then query distances from and to these nodes. Each
#      node queries 3 closest passing nodes from itself.
#
# Find a way to smartly query distances: Options:
# - 1. reducing the number of nodes by grouping geographically
# - 2. reducing the number of nodes by only taking the top 100 or so nodes
# - 3. querying for a set of central nodes, the distances from each node to their closest node and trying to correct the errors with L2 distance, or triangulation
# - 4. defining passing nodes (bridges) that have to be crossed to change boroughs. We can then query distances from and to these nodes. Each node queries 3 closest passing nodes from itself.

# todo figure out graph data
from pandas.plotting import register_matplotlib_converters
from scipy.spatial import distance_matrix

register_matplotlib_converters()

# Distribution of pickups over the observation period.
plt.hist(raw.Pickup_date,bins=30);

# NOTE: an `ordered.head()` call used to sit here, but `ordered` is only
# assigned further down the file, so it raised NameError when the file is run
# top to bottom. It has been removed.
raw.head()

# Count pickups per (month, day, hour) and zone, then pivot to one row per
# hour slot and one column per zone (0 where a zone had no pickup).
grouped = raw.groupby(by=[raw.Pickup_date.map(lambda x : (x.month,x.day,x.hour)),raw.locationID])
a = pd.DataFrame(grouped.size())
a.reset_index(inplace=True)
a = a.pivot(index='Pickup_date', columns='locationID',values=0).fillna(0)
a.to_csv(DATA_PATH+'occurences.csv')
a.head()

occ = pd.read_csv(DATA_PATH+'occurences.csv')

# After the CSV round trip Pickup_date is the string form of the
# (month, day, hour) tuple; the two characters before the closing
# parenthesis are the hour. numeric_only=True keeps the string Pickup_date
# column out of the mean (recent pandas raises on it otherwise).
hourly_mean = occ.groupby(by=[occ.Pickup_date.map(lambda x : int(x[-3:-1]))]).mean(numeric_only=True)
plt.bar(range(len(hourly_mean)),hourly_mean.mean(axis=1))
plt.title('Mean Uber use per zone in function of the hour')
hourly_mean

# Squared error (summed over hours and zones) of the hourly-mean predictor
# on rows 24*j .. 24*(j+1)-1.
# NOTE(review): this presumably corresponds to one calendar day, which only
# holds if every hour slot is present in the file - confirm.
j = 4
print('MSE using hourly mean {}'.format((occ[24*j:24*(j+1)].reset_index() - hourly_mean).pow(2).sum().sum()))

gpsData.head()
from itertools import product
from tqdm import tqdm_notebook as tqdm


def _add_zone_markers(fmap, zones, color):
    """Add one marker per row of ``zones`` to ``fmap``.

    Each marker pops up the zone name and shows the LocationID as tooltip.
    """
    for _, row in zones.iterrows():
        folium.Marker(
            [row['lat'], row['lng']],
            popup='<i> {} </i>'.format(row['Zone']),
            tooltip=row['LocationID'],
            icon=folium.Icon(color=color, icon='info-sign')
        ).add_to(fmap)


def _zone_lat(zones, location_id):
    """Return the latitude of the row of ``zones`` with this LocationID."""
    # .iloc[0] instead of float(<1-element array>), which NumPy deprecates.
    return float(zones.loc[zones['LocationID'] == location_id, 'lat'].iloc[0])


# Euclidean distance (in lat/lng units) between every pair of zones.
dist = pd.DataFrame(distance_matrix(gpsData[['lat','lng']].values,gpsData[['lat','lng']].values), index=gpsData.LocationID, columns=gpsData.LocationID)
dist.head()

# Edge weight = inverse squared distance; the zero distances become inf and
# are replaced by 1.
distMap = np.array(dist)
distMap = np.power(distMap*100,-2)
distMap[distMap == np.inf] = 1
# from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# replacement for plain ndarrays.
G = nx.from_numpy_array(distMap)
G.remove_node(263)
G.remove_node(264)
nx.draw(G)
G.get_edge_data(1,2)

# Reload the zones and keep only those with more than 100 pickups.
gpsData = pd.read_csv(DATA_PATH+'gps_zones.csv')
gpsData = gpsData.drop('Unnamed: 0',axis=1)
gpsData['Counts'] = raw['locationID'].value_counts().reindex(gpsData.LocationID.unique(), fill_value=0).values
ordered = gpsData.sort_values(by='Counts',ascending=False)
plt.bar(range(len(ordered)),ordered['Counts'])
plt.title('Counts per zones')
gpsData = gpsData[gpsData['Counts']>100]
gpsData = gpsData.drop(264)
len(gpsData)

# All remaining zones on one map.
m = folium.Map( location=[40.7, -73.9], zoom_start=11, tiles='Stamen Terrain')
_add_zone_markers(m, gpsData, 'red')
m

# Split the large boroughs into a southern and a northern group, using the
# latitude of a reference zone as the cut.
tmp = gpsData[gpsData['Borough']=='Manhattan']
southMan = tmp[tmp['lat'] <= _zone_lat(tmp, 161)]
northMan = tmp[tmp['lat'] >= _zone_lat(tmp, 170)]
print(len(southMan),len(northMan))

bronx = gpsData[gpsData['Borough']=='Bronx']
print(len(bronx))

tmp = gpsData[gpsData['Borough']=='Brooklyn']
southBrook = tmp[tmp['lat'] <= _zone_lat(tmp, 188)]
northBrook = tmp[tmp['lat'] >= _zone_lat(tmp, 85)]
print(len(southBrook),len(northBrook))

tmp = gpsData[gpsData['Borough']=='Queens']
southQueens = tmp[tmp['lat'] <= _zone_lat(tmp, 82)]
northQueens = tmp[tmp['lat'] >= _zone_lat(tmp, 121)]
print(len(southQueens),len(northQueens))

staten = gpsData[gpsData['Borough']=='Staten Island']
print(len(staten))

EWR = gpsData[gpsData['Borough']=='EWR']
print(len(EWR))

#upper triangle of matrix
l = np.array([len(southMan),len(northMan),len(bronx),len(southBrook),len(northBrook),len(southQueens),len(northQueens),len(staten)])
print('Number of queries: {}'.format((np.power(l,2).sum() - l.sum())/2))

# Visual check of the Queens split: north in red, south in green.
m = folium.Map( location=[40.7, -73.9], zoom_start=11, tiles='Stamen Terrain' )
_add_zone_markers(m, northQueens, 'red')
_add_zone_markers(m, southQueens, 'green')
m

len(gpsData)

time = 1577005200 # dec 22, 2019, 4 am ny time

# Bookkeeping matrix indexed by LocationID on both axes: inf marks a pair
# that has not been queried yet, the diagonal is 0. It is built directly
# instead of overwriting a throw-away distance matrix through
# `df[col].values[:]` / `df[i][i]`, which depends on pandas handing out
# writable views.
_pending = np.full((len(gpsData), len(gpsData)), np.inf)
np.fill_diagonal(_pending, 0)
df = pd.DataFrame(_pending, index=gpsData.LocationID, columns=gpsData.LocationID)
print(df.shape)

# One matrix per quantity returned by the Distance Matrix API.
dfDist = df.copy()
dfTime = df.copy()
dfTraffic = df.copy()
#southMan --> done 1225 (600 queries)
#northMan --> done 1521 (588 queries)
#bronx --> done 1764 (970 queries)
#southQueens --> done 1296
#northQueens --> done 1444
#southBrook --> done 1024
#northBrook --> done 1444
#staten --> done 196

# Query every not-yet-known pair inside one group (here: Staten Island).
for i,j in tqdm(product(staten.LocationID.values,staten.LocationID.values)):
    # The original `df[i][j]` addressed column i, row j; `.loc[j, i]` is the
    # same cell, but the write is guaranteed to land in the frame (chained
    # assignment `frame[i][j] = v` is not).
    if j>i and df.loc[j, i]==np.inf:
        tmp = gpsData[gpsData['LocationID']==i]
        s1 = (tmp['Zone'].values+', '+tmp['Borough'].values)[0]
        tmp = gpsData[gpsData['LocationID']==j]
        s2 = (tmp['Zone'].values+', '+tmp['Borough'].values)[0]
        res = gmaps.distance_matrix(s1,s2,mode='driving',units = 'metric', departure_time=time)
        element = res['rows'][0]['elements'][0]
        dfDist.loc[j, i] = element['distance']['value']
        dfTime.loc[j, i] = element['duration']['value']
        # Same fallback as the border-pair loop: use the plain duration when
        # the response carries no traffic estimate.
        if 'duration_in_traffic' in element:
            dfTraffic.loc[j, i] = element['duration_in_traffic']['value']
        else:
            print('replaced')
            dfTraffic.loc[j, i] = element['duration']['value']
        df.loc[j, i] = 1

# Hand-picked LocationIDs, paired per statement; each pair of lists is the
# input of one cross-group query run.
northManBorder1,northQueensBorder1 = [170,233,229,141,194,75,244,120,243],[145,146,157,193,7,179,8,207,138,253,226,112]
southManBorder1,northQueensBorder2 = [137,170,233,148,232,209,88],[146,145,193,226,157,138]
southManBorder2,northBrookBorder = [232,148,48,209,88,12,261,45],[112,255,256,217,33,65,40,52,257,228]
southManBorder3, statenEWRBorder = [125,211,261], [156,251,23,1]
statenBorder,EWRBorder = [156,251,23,118],[1]
northManBorder2 , bronxBorder1 = [194,75,42,116,244,243,120,127,153],[168,119,247,159,69,220,200,235,212]
bronxBorder2,northQueensBorder3 =[168,212,213,242,59,208], [7,179,8,207,138,253,226,93,53,252,9,192]
southBrookBorder,statenBorder2 = [14,11,67,228,55],[6,214,221]
southBrookBorder2,southQueensBorder = [150,154,222,76,77],[27,201,124,180,132]
northBrookBorder2,southQueensBorder2 = [76,77,63,177,37,80,112,255,256],[124,180,258,96,102,198,157,132]
northBrookBorder3, northQueensBorder4 = [112,255,256,80,177,63],[196,157,146,138]
#southMan --> done 1225 (600 queries)
#northMan --> done 1521 (588 queries)
#bronx --> done 1764 (970 queries)
#southQueens --> done 1296
#northQueens --> done 1444
#southBrook --> done 1024
#northBrook --> done 1444
#staten --> done 196
#northManBorder1,northQueensBorder1
#southManBorder1,northQueensBorder2
#southManBorder2,northBrookBorder
#southManBorder3, statenEWRBorder
#statenBorder,EWRBorder
#northManBorder2 , bronxBorder1
#bronxBorder2,northQueensBorder3
#southBrookBorder,statenBorder2
#southBrookBorder2,southQueensBorder
#northBrookBorder2,southQueensBorder2
#northBrookBorder3, northQueensBorder4

#now we go for
for i,j in tqdm(product(northBrookBorder3, northQueensBorder4)):
    # Order the pair so that every result lands in the same triangle of the
    # matrices, whichever list the smaller id came from.
    i,j = sorted([i,j])
    # The original `df[i][j]` addressed column i, row j; `.loc[j, i]` is the
    # same cell, but the write is guaranteed to land in the frame (chained
    # assignment `frame[i][j] = v` is not).
    if j>i and df.loc[j, i]==np.inf:
        tmp = gpsData[gpsData['LocationID']==i]
        s1 = (tmp['Zone'].values+', '+tmp['Borough'].values)[0]
        tmp = gpsData[gpsData['LocationID']==j]
        s2 = (tmp['Zone'].values+', '+tmp['Borough'].values)[0]
        res = gmaps.distance_matrix(s1,s2,mode='driving',units = 'metric', departure_time=time)
        element = res['rows'][0]['elements'][0]
        dfDist.loc[j, i] = element['distance']['value']
        dfTime.loc[j, i] = element['duration']['value']
        try:
            dfTraffic.loc[j, i] = element['duration_in_traffic']['value']
        except KeyError:
            # Only the missing traffic estimate is tolerated here; a bare
            # `except:` would also hide unrelated failures.
            print('replaced')
            dfTraffic.loc[j, i] = element['duration']['value']
        df.loc[j, i] = 1

# Persist the three result matrices.
dfDist.to_csv(DATA_PATH+'mapsDistance.csv')
dfTime.to_csv(DATA_PATH+'mapsTime.csv')
dfTraffic.to_csv(DATA_PATH+'mapsTraffic.csv')

# Spot check of one queried pair (column 1, row 125).
dfDist.loc[125, 1]
#create symmetric matrix
# Pairs that were never queried are stored as inf; they become 0 here. The
# filled triangle is then mirrored onto the other one by adding the
# transpose (the diagonal is 0, so it is unaffected).
mat = np.array(dfDist)
mat[np.isinf(mat)] = 0
mat = mat + mat.T
# from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# replacement for plain ndarrays.
GDist = nx.from_numpy_array(mat)
len(GDist)
nx.is_connected(GDist)

# Shortest paths from node 0 (graph nodes are matrix positions).
length, path = nx.single_source_dijkstra(GDist, 0)
length[145]

#recalculate shortest distances
#create symmetric matrix
mat = np.array(dfDist)
mat[np.isinf(mat)] = 0
mat = mat + mat.T
GDist = nx.from_numpy_array(mat)

# All-pairs shortest distance over the queried edges, one Dijkstra run per
# source node. Entries for nodes that are never reached keep their initial 0.
distMatrix = np.zeros((len(GDist),len(GDist)))
for i in range(len(GDist)):
    length, path = nx.single_source_dijkstra(GDist, i)
    for j, shortest in length.items():
        distMatrix[i][j] = shortest
        distMatrix[j][i] = shortest
#time
#create symmetric matrix
mat = np.array(dfTime)
mat[np.isinf(mat)] = 0
mat = mat + mat.T
# from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# replacement for plain ndarrays.
GTime = nx.from_numpy_array(mat)

# All-pairs shortest travel time over the queried edges, one Dijkstra run per
# source node. Entries for nodes that are never reached keep their initial 0.
timeMatrix = np.zeros((len(GTime),len(GTime)))
for i in range(len(GTime)):
    length, path = nx.single_source_dijkstra(GTime, i)
    for j, shortest in length.items():
        timeMatrix[i][j] = shortest
        timeMatrix[j][i] = shortest

distMatrix[0,20] #--> corresponds to direct path
dfDist.head(20)
gpsData.head(3)

# Direct API query for one pair, to compare with the graph-based value.
res = gmaps.distance_matrix('Alphabet City, Manhattan','Newark Airport, EWR',mode='driving',units = 'metric', departure_time=time)
print(res['rows'][0]['elements'][0]['distance']['value'])
print(res['rows'][0]['elements'][0]['duration']['value'])
gpsData
# Map of the retained zones (tooltip = pickup count) with every edge of the
# distance graph drawn as a thin blue line.
m = folium.Map(location=[40.7, -73.9], zoom_start=11, tiles='Stamen Terrain')

for _, zone_row in gpsData.iterrows():
    lat = zone_row['lat']
    lng = zone_row['lng']
    zone = zone_row['Zone']
    loc = zone_row['LocationID']
    counts = zone_row['Counts']
    red_icon = folium.Icon(color='red', icon='info-sign')
    marker = folium.Marker(
        [lat,lng],
        popup='<i> {} </i>'.format(zone),
        tooltip=counts,
        icon=red_icon
    )
    marker.add_to(m)

# Graph nodes are positions in gpsData, hence the positional lookups.
latitudes = gpsData['lat']
longitudes = gpsData['lng']
for x, y in GDist.edges():
    start = (latitudes.iloc[x], longitudes.iloc[x])
    end = (latitudes.iloc[y], longitudes.iloc[y])
    points = [start, end]
    edge = folium.PolyLine(locations=points, weight=1, color='blue')
    edge.add_to(m)

m

gpsData['lat'].iloc[x]